# === setup === #
library(here)
source(here("GitControlled/Codes/0_1_ls_packages.R"))
source(here("GitControlled/Codes/0_0_functions.R"))

# /*===== Data =====*/
reg_data <- 
  here("Shared/Data/WaterAnalysis/comp_reg_dt.rds") %>%
  readRDS() %>%
  .[year %in% 2008:2012 & usage <= 40,] %>%
  .[,tr_year:=factor(paste0(tr,year))] %>%
  .[,trs_year:=factor(paste0(trs,year))]
  • NOTE
    • When aggregating the data, we need to carefully deal with the data related to a township located in Gosper county in TB (tr=="5_22). This township is designated as a phase 3 groundwater quantity management area which requires allocation of pumping. Groundwater pumping was limited to a total of 27 inches-acre for 2009-2011.
# reg_data[tr=="5_22", ]
# reg_data[tr=="5_22", wellid] %>% unique() %>% length()

agg_reg_data <-
  copy(reg_data) %>%
  .[tr!="5_22",] %>%
  .[,.(
    sum_usage = sum(usage),
    mean_usage = mean(usage),
    treat2 = mean(treat2),
    # --- soil --- #
    silttotal_r = mean(silttotal_r),
    claytotal_r = mean(claytotal_r),
    slope_r = mean(slope_r),
    ksat_r = mean(ksat_r),
    awc_r = mean(awc_r),
    # --- weather --- #    
    sum_pr_in = sum(pr_in),
    mean_pr_in = mean(pr_in),
    sum_pet_in = sum(pet_in),
    mean_pet_in = mean(pet_in),
    sum_gdd_in = sum(gdd_in),
    mean_gdd_in = mean(gdd_in),
    # --- tr --- #
    tr = tr
    ),by=wellid] %>%
  unique(.,by="wellid")

1 Well-year level Data

# /*===== control variables =====*/
# remove gdd_in
cov_ls <- c(
  # --- weather --- #
  "pr_in","pet_in",
  # --- soil --- #
  "silttotal_r", "claytotal_r", "slope_r", "ksat_r", "awc_r"
  )

data <- reg_data
Y <- data[, usage]
T <- data[, treat2]
X <- data[, ..cov_ls]
cl_var <- data[,tr_year]

#/*--------------------------------*/
#' ## 1st CF
#/*--------------------------------*/
set.seed(23456)
forest_W <- regression_forest(X, T, num.trees = 4000, clusters = cl_var)
W_hat <- predict(forest_W)$predictions

forest_Y <- regression_forest(X, Y, num.trees = 4000, clusters = cl_var)
Y_hat <- predict(forest_Y)$predictions

cf_raw <- causal_forest(
  X=X,
  Y=Y,
  W=T, 
  # Y.hat = Y_hat, 
  # W.hat = W_hat,
  clusters = cl_var,
  tune.parameters = "all",
  num.trees = 4000
)

varimp = variable_importance(cf_raw)
selected_vars = which(varimp > mean (varimp))
# selected_vars = which(varimp/mean(varimp) > 0.2)
se_cov <- data[,cov_ls[selected_vars], with=FALSE]

vis_cf_raw <- gen_impact_viz(
  cf_res= cf_raw,
  data_base=data,
  treat_var='treat2',
  var_ls= cov_ls,
  var_ls_int = cov_ls
)
vis_cf_raw


2 Well level Data

2.1 Sum

cov_ls <- c(
  # --- weather --- #
  "sum_pr_in","sum_pet_in",
  # --- soil --- #
  "silttotal_r", "claytotal_r", "slope_r", "ksat_r", "awc_r"
  )

data <- agg_reg_data
Y <- data[, sum_usage]
T <- data[, treat2]
X <- data[, ..cov_ls]
cl_var <- data[,tr] %>% factor()

#/*--------------------------------*/
#' ## 1st CF
#/*--------------------------------*/
set.seed(23456)
forest_W <- regression_forest(X, T, num.trees = 4000, clusters = cl_var)
W_hat <- predict(forest_W)$predictions

forest_Y <- regression_forest(X, Y, num.trees = 4000, clusters = cl_var)
Y_hat <- predict(forest_Y)$predictions

cf_raw <- causal_forest(
  X=X,
  Y=Y,
  W=T, 
  Y.hat = Y_hat, 
  W.hat = W_hat,
  clusters = cl_var,
  tune.parameters = "all",
  num.trees = 4000
)

varimp = variable_importance(cf_raw)
selected_vars = which(varimp > mean (varimp))
# selected_vars = which(varimp/mean(varimp) > 0.2)
se_cov <- data[,cov_ls[selected_vars], with=FALSE]

vis_cf_raw <- gen_impact_viz(
  cf_res= cf_raw,
  data_base=data,
  treat_var='treat2',
  var_ls= cov_ls,
  var_ls_int = cov_ls
)
vis_cf_raw

2.2 Mean

cov_ls <- c(
  # --- weather --- #
  "mean_pr_in","mean_pet_in",
  # --- soil --- #
  "silttotal_r", "claytotal_r", "slope_r", "ksat_r", "awc_r"
  )

data <- agg_reg_data
Y <- data[, mean_usage]
T <- data[, treat2]
X <- data[, ..cov_ls]
cl_var <- data[,tr] %>% factor()

#/*--------------------------------*/
#' ## 1st CF
#/*--------------------------------*/
set.seed(23456)
forest_W <- regression_forest(X, T, num.trees = 4000, clusters = cl_var)
W_hat <- predict(forest_W)$predictions

forest_Y <- regression_forest(X, Y, num.trees = 4000, clusters = cl_var)
Y_hat <- predict(forest_Y)$predictions

cf_raw <- causal_forest(
  X=X,
  Y=Y,
  W=T, 
  Y.hat = Y_hat, 
  W.hat = W_hat,
  clusters = cl_var,
  tune.parameters = "all",
  num.trees = 4000
)

varimp = variable_importance(cf_raw)
selected_vars = which(varimp > mean (varimp))
# selected_vars = which(varimp/mean(varimp) > 0.2)
se_cov <- data[,cov_ls[selected_vars], with=FALSE]

vis_cf_raw <- gen_impact_viz(
  cf_res= cf_raw,
  data_base=data,
  treat_var='treat2',
  var_ls= cov_ls,
  var_ls_int = cov_ls
)
vis_cf_raw